* ---------------------------------------------------------------------------
* Session setup: empty the workspace, turn off output paging, and send all
* output to the plain-text log 2Tables (replacing any earlier copy).
* ---------------------------------------------------------------------------
clear all
set more off
capture log close
log using 2Tables, text replace
set matsize 750

use hans_file2

* Family structure: famid numbers the distinct values of fpid. Within each
* family the members are ordered from highest to lowest age, and birthorder
* is the position in that ordering (1 = oldest).
egen famid = group(fpid)
gsort famid -age
quietly by famid: generate birthorder = _n if famid != .
rename p_se parentr
* daddied = 1 when p_dy is recorded and faar is later than p_dy.
generate byte daddied = (p_dy != .) & (faar > p_dy)

* Log earnings. Observations whose log is undefined are removed.
generate lpay = ln(pearn)
label variable lpay "Log of highest earnings 1996-1998"
generate lasset = ln(bruttoform)
drop if lpay == .

* Log assets. An undefined log is flagged in missasset and recoded to 0 so
* the observation can stay in regressions that control for both variables.
generate missasset = (lasset == .)
replace lasset = 0 if missasset == 1

* Relatedness measures: relateK is 1 when rK_ equals 10 and 0 otherwise,
* for K = 1, 2, 3.
forvalues k = 1/3 {
    generate relate`k' = (r`k'_ == 10)
    * A logical comparison never returns missing, so this recode is only a safeguard.
    replace relate`k' = 0 if relate`k' == .
}
summarize relate*

* Father's industry: fi5 is the code stored in naceSSB and fi2 is that code
* with its last three digits removed. The default "same" indicator is the
* 5-digit match (same5_s); same2 is the 2-digit match (same2_s).
rename naceSSB fi5
generate fi2 = int(fi5/1000)
rename (same5_s same2_s) (same same2)

* Work industry codes, and flags for whether they equal the father's codes.
rename (nace nace_2) (i5work i2work)
generate same2_pre = (i2work == fi2)
generate same5_pre = (i5work == fi5)

* Entrepreneur summary statistics before the sample restrictions.
summarize sex share age educ same5_* pearn bruttoform parentr

* Sample restrictions: drop sex code 2, drop observations missing earnings,
* assets, age or education, and drop zero earnings.
drop if sex == 2
drop if pearn == . | bruttoform == . | age == . | educ == .
drop if pearn == 0

* Table 1 details for entrepreneurs (restricted sample).
summarize sex share age educ same same2 relate3 pearn bruttoform parentr

* Venture variables: missing employee counts become 0, and the raw
* accounting variables get the names used in the rest of the file.
replace employees_4 = 0 if employees_4 == .
rename (sumaksje sumeiend_4 salgsinn_4) (equity_0 assets_4 sales_4)

* Tech definition
* Industry codes at decreasing detail, all derived from naceSSB_orig:
* n5 is the code as stored, and n4, n3, n2 drop one, two and three trailing
* digits. With a 5-digit code, n3 is the 3-digit group and n2 the 2-digit
* division (assumes naceSSB_orig holds 5-digit codes - TODO confirm).
gen n5 = naceSSB_orig
gen n4 = int(n5/10)
gen n3 = int(n5/100)
gen n2 = int(n5/1000)

** hi tech
gen hi_tech = (n3==244 | n2==30 | n2==32 | n2==33 | n3==353)
tab hi_tech, m

** medium to high tech
gen medium_hi_tech = (n2==29 | n2==31 | n2==34)
replace medium_hi_tech = 1 if n2==24 & n3!=244
replace medium_hi_tech = 1 if n2==35 & n3!=351 & n3!=353

** medium to low tech. NB here we add oil and gas nace 11!!!!
* Code key: nace==11 oil and gas, 23 coke refined petroleum, 24 chemicals,
* 27 metals, 28 mechanical, 29 machinery, 30 office equip and computers,
* 31 electrical, 32 radio television communication, 33 medical instruments,
* 34 motor vehicles, 72 IT
* FIX: divisions 23-28 (excluding 24) are 2-digit codes, so the test must use
* n2. The earlier version compared the 3-digit n3 with 23..28; n3 for these
* divisions lies between 230 and 289, so the test never matched them and only
* nace 11 was ever flagged.
gen medium_lo_tech = (n2>=23 & n2<=28) & n2!=24
replace medium_lo_tech = 1 if n2==11

** hi_tech knowledge intensive
replace hi_tech = 1 if n2==64 | n2==72 | n2==73

** low tech: everything not in one of the three categories above
gen lo_tech = 0
replace lo_tech = 1 if hi_tech==0 & medium_hi_tech==0 & medium_lo_tech==0

** broad tech category: 1 for any industry that is not low tech
gen tech = 0
replace tech = 1 if lo_tech==0
tab tech

* Merge in the works_sometime file by pid (it holds the variable indicating
* if the entrepreneur ever works at the firm). Only observations found in
* both files are retained.
sort pid
merge pid using works_sometime
tabulate _merge
keep if _merge == 3
drop _merge

* Summary Statistics for Ventures (Table 1, Panel B): full sample, then
* split by whether same2 is nonzero.
local venturevars equity survive4 assets_4 sales_4 employees_4 ebitda_4 roa_4 tech
summarize `venturevars'
summarize `venturevars' if same2
summarize `venturevars' if !same2

* Analysis file reloaded by the later sections.
save tableNoIQdata, replace

* ---------------------------------------------------------------------------
* Coefficient on "same" in a log-sales regression, allowed to differ across
* deciles of max_empl, and plotted with coefplot (Figure 4).
* coefplot is a community-contributed command and must be installed.
* ---------------------------------------------------------------------------
clear
use tableNoIQdata

* Fixed seed so the random tie-breaking noise added below repeats across runs
* (this also relies on the sort on stiftetorgnr giving a unique order -
* TODO confirm stiftetorgnr has no duplicates).
set seed 123
replace max_empl=0 if max_empl==.
sort stiftetorgnr
* Add a random amount below 0.001 so that xtile can split tied values.
replace max_empl=max_empl+uniform()/1000
xtile empl_deciles=max_empl, nquant(10)
tab empl_, m
* Every firm with max_empl at or below 1.2 is placed in decile 5.
replace empl_deciles=5 if max_empl<=1.2
tab empl_d
gen lsales=ln(sales_4)
* inter is the decile number multiplied by same, so it is 0 whenever same is 0
* (and equals the decile when same is 1, assuming same is a 0/1 indicator).
gen inter=same*empl_decile
* One indicator per value of inter; the indicator for inter==0 is dropped so
* the remaining _Iinter* terms are the decile-specific "same" effects.
xi i.inter, noomit
drop _Iinter_0
* Regression without a constant: decile indicators, the interaction
* indicators, and the controls listed.
reg lsales i.empl_decile _Iinter* i.yr age i.educ logbrform* logequity i.fi5 parentr, noc
* Figure 4
* The xlabel relabels plot positions 1-6 as deciles 5-10 (assumes exactly six
* _Iinter coefficients are estimated - TODO confirm).
coefplot, keep(_Iinter*) vertical xlabel(1 "5" 2 "6" 3 "7" 4 "8" 5 "9" 6 "10") xtitle("Deciles of initial employment") ytitle("Same coefficient on sales") note("Any firm with no more than one employee is in the fifth decile.")
graph export sizedecile.png, replace

use tableNoIQdata, clear

* Creates Table 2
* Collapse to one row per father 2-digit industry (fi2): number of founders,
* share with same2, and mean education. Survival and log equity are averaged
* only over founders with same2 different from 1, because both variables are
* set to missing where same2 equals 1 before the means are taken.
quietly bysort fi2: generate fi2count = _N
egen fi2same = mean(same2), by(fi2)
egen fi2educ = mean(educ), by(fi2)

replace survive4 = . if same2 == 1
egen fi2surv = mean(survive4), by(fi2)

replace equity = cond(same2 == 1, ., ln(equity))
egen fi2equity = mean(equity), by(fi2)

* Keep the first row of each fi2 group and only the industry-level variables.
drop if fi2 == fi2[_n-1]
sort fi2
keep fi2 fi2count fi2educ fi2same fi2surv fi2equity
sort fi2same

* Industries with fewer than 100 founders are excluded.
drop if fi2count < 100

* Industry-level binned scatter plots, weighted by the number of founders.
* binscatter is a community-contributed command and must be installed.
binscatter fi2same fi2educ [fw=fi2count], xtitle("Average education for founders in industry") ytitle("Fraction entrepreneurs following fathers (2-digit industry)")
graph export industrySameEduc.png, replace

binscatter fi2surv fi2same [fw=fi2count], ytitle("4-year survival rate in industry") xtitle("Fraction entrepreneurs following fathers (2-digit industry)")
graph export industrySameSurvive.png, replace

binscatter fi2equity fi2same [fw=fi2count], ytitle("Industry average log initial equity invested") xtitle("Fraction entrepreneurs following fathers (2-digit industry)")
graph export industrySameEquity.png, replace

list
save fi2, replace

* Reload the analysis file for the regression tables.
clear
use tableNoIQdata

* First regression defines it as same 5-digit industry
* Pooled OLS of the 5-digit "same" indicator on founder characteristics.
reg same educ age lasset missasset parentr
* Same specification with father-industry (fi5) fixed effects absorbed.
* FIX: absorb() is an option of areg, not of regress, so this line previously
* stopped the do-file with "option absorb() not allowed".
xi: areg same educ age lasset missasset parentr, absorb(fi5)

** PLEASE NOTE: TABLE 3 IS GENERATED AT BOTTOM.

*****************************
* Performance Table       ***
*****************************
* leq is the log of initial equity (equity_0); an undefined log is set to 0.
generate leq = ln(equity_0)
replace leq = 0 if leq == .

* Regressions for Table 4: log initial equity on "same" and controls,
* first pooled, then with fi5 fixed effects absorbed. Standard errors are
* clustered on fi5 in both.
xi: reg leq same educ age lasset missasset parentr i.faar, cluster(fi5)
xi: areg leq same educ age lasset missasset i.faar parentr, absorb(fi5) cluster(fi5)

* Regressions for Table 5: for each venture outcome, the pooled regression
* followed by the fi5 fixed-effects version.
rename survive4 survive
tabulate survive same

* Log outcomes used in the loop below.
generate lsales = ln(sales_4)
generate lass4 = ln(assets_4)

foreach outcome in survive lsales employees_4 lass4 ebitda_4 {
    xi: reg `outcome' educ age same lasset missasset leq parentr i.faar, cluster(fi5)
    xi: areg `outcome' educ age same i.faar lasset missasset leq parentr, absorb(fi5) cluster(fi5)
}

* Below regressions for various Appendix tables...
* The same six fi5 fixed-effects specifications are run for every outcome:
*   1. same interacted with daddied
*   2. same, restricted to observations with daddied nonzero
*   3. same2 in place of same
*   4. same, same2 and relate3 together
*   5. same interacted with parentr
*   6. same, restricted to observations with parentr equal to 0
* Figure 5 is drawn right after the lsales regressions, before the remaining
* outcomes, because it temporarily recodes lsales.
foreach outcome in survive lsales employees_4 lass4 ebitda_4 {
    xi: areg `outcome' educ age i.daddied*same i.faar lasset missasset leq, absorb(fi5) cluster(fi5)
    xi: areg `outcome' educ age same i.faar lasset missasset leq if daddied, absorb(fi5) cluster(fi5)
    xi: areg `outcome' educ age same2 i.faar lasset missasset leq parentr, absorb(fi5) cluster(fi5)
    xi: areg `outcome' educ age same same2 relate3 i.faar lasset missasset leq, absorb(fi5) cluster(fi5)
    xi: areg `outcome' educ age i.same*i.parentr i.faar lasset missasset leq, absorb(fi5) cluster(fi5)
    xi: areg `outcome' educ age same i.faar lasset missasset leq if !parentr, absorb(fi5) cluster(fi5)

    if "`outcome'" == "lsales" {
        * Create Figure 5 (See below for figures 2, 3, and 4)
        * Missing log sales are shown as 0 in the density plot and are set
        * back to missing afterwards. Any lsales that is exactly 0 is also
        * set to missing by the last line.
        replace lsales = 0 if lsales == .
        set scheme s2mono
        twoway (kdensity lsales if same, color(green)) (kdensity lsales if !same), ///
            legend(order(1 "Followers" 2 "Non-Followers" )) ///
            xtitle("Distribution of Log Sales After 4 Years (Zero if out of business)") ///
            ytitle("Frequency")
        graph export sales4dist.png, replace
        replace lsales = . if lsales == 0
    }
}

* Create Table 6 Outlier Analysis
* Missing year-4 outcomes are treated as zero.
foreach v of varlist employees_4 assets_4 sales_4 roa_4 ebitda_4 {
    replace `v' = 0 if `v' == .
}

* Now look at outliers controlling for industry
* Industries (fi2) with fewer than 20 observations are removed. Within each
* remaining industry an observation is flagged when its rank divided by the
* industry size is at least .95, i.e. it is in the top 5 percent by rank.
quietly bysort fi2: drop if _N < 20
quietly bysort fi2 (employees_4): generate byte emp95 = _n/_N >= .95
quietly bysort fi2 (assets_4): generate byte asset95 = _n/_N >= .95
quietly bysort fi2 (sales_4): generate byte sales95 = _n/_N >= .95
quietly bysort fi2 (ebitda_4): generate byte ebit95 = _n/_N >= .95

* Table 6: linear probability models for each top-5-percent flag.
foreach flag in sales95 emp95 asset95 ebit95 {
    xi: reg `flag' same educ age lasset missasset leq parentr i.faar
}

clear
* Data with IQs is in "tabledata.dta"
use tabledata

* Summary Statistics (Only IQ used in paper)
* Whole file first, then separately for entrep equal to 0 and equal to 1.
summ share age educ iq same same2 pearn brform
summ share age educ iq same same2 samepre pearn brform mdist ddist mentr dentr parentr if entrep == 0
summ share age educ iq same same2 samepre pearn brform mdist ddist mentr dentr parentr if entrep == 1

* Table 3
* Regressions of "same" on IQ among observations with entrep equal to 1:
* IQ alone, with controls, with fi5 fixed effects, and with family (famid)
* fixed effects.
keep if entrep==1
reg same iq, cluster(fi5)
reg same iq educ age lasset missasset parentr, cluster(fi5)
* FIX: the two fixed-effects models use areg. absorb() is not an option of
* regress, so the earlier "reg ..., absorb()" form stopped with
* "option absorb() not allowed".
xi: areg same iq educ age lasset missasset parentr, absorb(fi5) cluster(fi5)
xi: areg same iq educ age lasset missasset, absorb(famid)

* Figure 2
* Mean of same2 by IQ score, for observations with iq above 2 and a nonzero
* daddied flag. NOTE: Stata treats missing as larger than any number, so
* iq>2 is also true when iq is missing.
graph bar (mean) same2 if iq>2 & daddi, over(iq, gap(100)) ytitle("Fraction in Father's Industry") b1title("IQ")
graph export samedead.png, replace

* Figure 3
* Reduce to one row per 2-digit industry (i2) and attach the industry-level
* variables stored in i2data (wi2iq, wi2same and the weight used below).
rename fi2 i2
sort i2
egen i2same = mean(same2), by(i2)
* FIX: keep the industry mean i2same that was just computed. The earlier
* "keep i2 same2" discarded it and left one arbitrary individual's same2 per
* industry. The plot below uses only variables from i2data, so the figure is
* unchanged.
keep i2 i2same
sort i2
drop if i2==i2[_n-1]
merge 1:1 i2 using i2data
tab _m
drop _m
sort i2
label var wi2iq "Average IQ for employees in industry"
label var wi2same "Fraction entrepreneurs following fathers (2-digit industry)"
* binscatter is a community-contributed command and must be installed.
binscatter wi2s wi2i [fw=wi2c], xtitle("Average IQ for employees in industry") ytitle("Fraction entrepreneurs following fathers (2-digit industry)")
graph export industrySameIQ.png, replace

* Figure 4
* Coefficient on "same" in a log-sales regression, allowed to differ across
* deciles of max_empl, plotted with coefplot (a community-contributed
* command that must be installed).
* NOTE(review): this section repeats the Figure 4 code that appears earlier
* in this file, so sizedecile.png is written twice per run.
clear
use tableNoIQdata

* Fixed seed so the random tie-breaking noise added below repeats across runs
* (this also relies on the sort on stiftetorgnr giving a unique order -
* TODO confirm stiftetorgnr has no duplicates).
set seed 123
replace max_empl=0 if max_empl==.
sort stiftetorgnr
* Add a random amount below 0.001 so that xtile can split tied values.
replace max_empl=max_empl+uniform()/1000
xtile empl_deciles=max_empl, nquant(10)
tab empl_, m
* Every firm with max_empl at or below 1.2 is placed in decile 5.
replace empl_deciles=5 if max_empl<=1.2
tab empl_d
gen lsales=ln(sales_4)
* inter is the decile number multiplied by same, so it is 0 whenever same is 0.
gen inter=same*empl_decile
* One indicator per value of inter; the indicator for inter==0 is dropped.
xi i.inter, noomit
drop _Iinter_0
* Regression without a constant on decile indicators, the interaction
* indicators, and the controls listed.
reg lsales i.empl_decile _Iinter* i.yr age i.educ logbrform* logequity i.fi5 parentr, noc
* The xlabel relabels plot positions 1-6 as deciles 5-10 (assumes exactly six
* _Iinter coefficients are estimated - TODO confirm).
coefplot, keep(_Iinter*) vertical xlabel(1 "5" 2 "6" 3 "7" 4 "8" 5 "9" 6 "10") xtitle("Deciles of initial employment") ytitle("Same coefficient on sales") note("Any firm with no more than one employee is in the fifth decile.")
graph export sizedecile.png, replace

*****************************************************************************************************
** Graph: discontinuity at exact same industry code for all three defs of relatedness DEAD FATHERS **
*****************************************************************************************************
* Each pass reloads hans_file2, keeps observations with f_dead equal to 1,
* merges the list of related industries for relatedness definition x, and
* plots the share of startups found at each distance from the father's
* industry.
* NOTE(review): the graph export file names below do not contain x, so every
* pass overwrites the files from the pass before and only the x=3 graphs
* remain on disk. Confirm that this is intended.
forvalues x=1/3 {
use hans_file2, clear  //stiftetorgnr pid share faar naceSSB p_nace5 using
keep if f_dead==1
* Remove these variables when they exist; capture ignores the error otherwise.
capture drop N
capture drop f1
capture drop f2
capture drop f3
* Both the startup and the father industry code must be present and nonzero.
drop if naceSSB==.|naceSSB==0
drop if p_nace5==.|p_nace5==0

rename naceSSB nace_startup
rename p_nace5 nace_father

** import most related industry according to labor transition matrix
* Old-style merge syntax: the master data must be sorted on the key.
* nokeep discards observations that appear only in the using file, and
* keep() limits the variables taken from it to those starting with nace_startup.
sort nace_father 

merge nace_father using top_sectors_f`x', nokeep keep(nace_startup*)
tab _merge
drop _merge

** new variables
* Industry codes for startup and father at 5, 4, 3, 2 and 1 digits.
global listx "nace_startup nace_father"
foreach v of global listx {
gen `v'_5=`v'
gen `v'_4=floor(`v'/10)
gen `v'_3=floor(`v'/100)
gen `v'_2=floor(`v'/1000)
gen `v'_1=floor(`v'/10000)
}

** same x-digit industry
forvalues t=1/5 {
gen same`t'=nace_startup_`t'==nace_father_`t'
gen notsame`t'=1-same`t'
}

** same at 5-digit level for related industries
* same_close_t is 1 when the startup industry equals the t-th merged code
* (presumably the t-th most related industry to the father's - TODO confirm).
* Assumes the using file supplies nace_startup1 through nace_startup100.
forvalues t=1/100 {
gen same_close_`t'=nace_startup==nace_startup`t'
tab same_close_`t', m
label var same_close_`t' `t'
}

** Take data to long (graph bar does not work that well)
** this means one row per same_close dummy
* The exact 5-digit match (same5) becomes distance 0. The startup code is
* renamed first so that it is not treated as part of the nace_startup stub.
rename nace_startup nace_startupx
rename same5 same_close_0
reshape long nace_startup same_close_, i(pid)
save temp, replace

** This graph works but hard to customize with graph bar
graph bar (mean) same_close, over(_j) legend(off) ytitle("Entrepreneurs per hundred") b1title("Distance from father's industry") title("Startup rates by distance from father's industry") subtitle("Dead father subsample")
*graph export freq_graph_bar_f`x'_DEAD.png, replace as(png)
graph export freq_graph_bar_DEAD.png, replace as(png)

use temp, clear
* Shift distances up by one so the exact match is plotted at 1, then drop
* the last distance (101 after the shift).
replace _j=_j+1
drop if _j==101
label var _j "Distance from father's industry"
replace same_close=same_close*1000  //n per thousand instead of percent
* binscatter is a community-contributed command and must be installed.
binscatter same_close _j, line(connect) xq(_j) ytitle("Entrepreneurs per thousand") xtitle("Distance from father's industry") msymbol(o) //subtitle("Dead father subsample") //title("Startup rates by distance from father's industry") //msymbol(o) //  //scheme(economist) msize(*.25)
graph export freq_graph_binscatter_DEAD.png, replace as(png)
}

log close
